oclude profiling experiments on the rodinia OpenCL benchmark suite

In this notebook, we are going to gather/present general info regarding our experiments using the oclude tool for standalone OpenCL kernel execution and profiling (see here).
The experiments have already taken place (they will not be conducted in this notebook) and the results have been stored in the form of JSON data as text files. Special helper functions have been written to parse these files and turn them into Python-native dictionaries.
Please note that in this notebook, we treat all data in the same way, regardless of the kernel the execution of which produced them. This way, we hope to gain insight on the rodinia benchmark suite and the way OpenCL kernels are written and the behavior they demonstrate in general.
%%capture
!pip install --upgrade numpy
!pip install --upgrade scipy
!pip install --upgrade pandas
!pip install --upgrade plotly
!pip install --upgrade matplotlib
!pip install --upgrade seaborn
### all imports in this cell ###
import os
import itertools
import numpy as np
from scipy import stats
import pandas as pd
import seaborn as sb
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from IPython.display import display, HTML
from pprint import pprint
%matplotlib inline
### some globals ###
outdir = 'outputs'
# some helper functions, do not bother with them
def sorted_filenames_keyfunc(filename):
    """Sort key for profiling filenames: the gsize, i.e. the integer in the
    fourth '__'-separated field (extension stripped)."""
    gsize_field = filename.split('__')[3]
    return int(gsize_field.split('.')[0])
def get_grouped_all_profilings_filenames():
    """Group the profiling filenames found in `outdir` by kernel identity.

    Returns a list of ([benchmark, file, kernel], filenames) pairs, where
    each group's filenames are sorted by their embedded gsize.
    """
    def groupby_keyfunc(filename):
        # the first three '__'-separated fields identify the kernel
        return filename.split('__')[:3]
    all_filenames = sorted(os.listdir(outdir), key=groupby_keyfunc)
    return [
        (key, sorted(group, key=sorted_filenames_keyfunc))
        for key, group in itertools.groupby(all_filenames, groupby_keyfunc)
    ]
def get_gsize_from_profiling_filename(filename):
    """Extract the gsize encoded in a profiling filename: the integer that
    sits after the last '__' and before the file extension."""
    last_field = filename.rsplit('__', 1)[-1]
    return int(last_field.split('.', 1)[0])
def get_average_instcounts_dicts(arg, discard_zeros=False):
    """Average the per-sample 'instcounts' (and 'timeit', if present) of a profiling.

    arg may be a profiling dict, a filename (loaded via load_profiling_dicts),
    or a list/tuple of either — handled recursively.
    If discard_zeros is True, instructions whose summed count is not positive
    are dropped (Counter-addition semantics); otherwise zero counts are kept.
    Returns a dict shaped like the input, with 'results' replaced by the
    averaged values (instcounts use integer division by the sample count).
    """
    if isinstance(arg, (list, tuple)):
        # BUGFIX: propagate discard_zeros into the recursion — the original
        # `map(get_average_instcounts_dicts, arg)` silently dropped it and
        # always fell back to the default.
        return [get_average_instcounts_dicts(x, discard_zeros) for x in arg]
    from collections import Counter
    from functools import reduce
    from operator import add
    if not isinstance(arg, dict):
        arg = load_profiling_dicts(arg)
    results = arg['results']
    samples = len(results)
    if discard_zeros:
        # Counter addition keeps only keys whose summed count is positive
        avg_instcounts = dict(
            reduce(add, map(Counter, (r['instcounts'] for r in results)))
        )
    else:
        c = Counter()
        for d in (r['instcounts'] for r in results):
            c.update(d)  # Counter.update keeps zero-valued entries
        avg_instcounts = dict(c)
    avg_instcounts = {
        k: int(v) // samples for k, v in avg_instcounts.items()
    }
    if 'timeit' in results[0]:
        avg_timeit = dict(
            reduce(add, map(Counter, (r['timeit'] for r in results)))
        )
        avg_timeit = {k: v / samples for k, v in avg_timeit.items()}
    else:
        avg_timeit = None
    # copy every top-level key except the raw 'results'
    avg_dict = {k: v for k, v in arg.items() if k != 'results'}
    avg_dict['results'] = dict(instcounts=avg_instcounts)
    if avg_timeit:
        avg_dict['results']['timeit'] = avg_timeit
    return avg_dict
def get_list_of_profilings_filenames(benchmark=None, file=None, kernel=None):
    """List profiling filenames in `outdir`, optionally filtered by kernel.

    With benchmark=None, returns the bare filenames of everything in outdir.
    Otherwise returns outdir-joined paths of the files whose names start
    with 'benchmark__file__kernel'.
    """
    if benchmark is None:
        return os.listdir(outdir)
    # BUGFIX: str.join takes a single iterable; the original passed three
    # separate arguments, raising TypeError whenever a filter was supplied.
    prefix = '__'.join((benchmark, file, kernel))
    profilings_filenames = [
        fn for fn in os.listdir(outdir) if fn.startswith(prefix)
    ]
    return [os.path.join(outdir, fn) for fn in profilings_filenames]
def get_sorted_list_of_profilings_filenames(benchmark=None, file=None, kernel=None):
    """Same as get_list_of_profilings_filenames, but sorted by gsize."""
    filenames = get_list_of_profilings_filenames(benchmark, file, kernel)
    return sorted(filenames, key=sorted_filenames_keyfunc)
def load_profiling_dicts(arg, append_gsize=True):
    """Load one profiling JSON file (or a list/tuple of them) from `outdir`.

    If append_gsize is True, the gsize parsed from the filename is stored
    under the 'gsize' key of the returned dict.
    """
    if isinstance(arg, (list, tuple)):
        # BUGFIX: propagate append_gsize — the original recursion
        # `map(load_profiling_dicts, arg)` silently dropped it.
        return [load_profiling_dicts(x, append_gsize) for x in arg]
    from json import load
    with open(os.path.join(outdir, arg), 'r') as f:
        profiling_dict = load(f)
    if append_gsize:
        profiling_dict['gsize'] = get_gsize_from_profiling_filename(arg)
    return profiling_dict
def remove_zeros(df):
    """Return df without its all-zero columns."""
    nonzero_columns = (df != 0).any(axis=0)
    return df.loc[:, nonzero_columns]
def average_correlation_heatmap(dfs):
    """Plot (and return) the element-wise average of the correlation matrices of dfs.

    NaN correlation entries (e.g. from constant columns) are treated as
    missing: each cell is averaged only over the DataFrames in which it is
    defined. The lower triangle is drawn as a seaborn heatmap.
    """
    corrs = [df.corr() for df in dfs]
    average_corrs = corrs[0].fillna(0)
    # per-cell count of non-NaN contributions, used as the divisor
    ns = np.zeros_like(average_corrs)
    ns += (~corrs[0].isna())
    for corr in corrs[1:]:
        average_corrs += corr.fillna(0)
        ns += (~corr.isna())
    average_corrs /= ns
    # BUGFIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24
    # (this notebook pip-installs the latest numpy); the builtin bool is
    # the correct dtype here.
    mask = np.triu(np.ones_like(average_corrs, dtype=bool))
    f, ax = plt.subplots(figsize=(30, 20))
    sb.heatmap(
        average_corrs, annot=False, mask=mask, cmap='magma', center=0,
        square=True, linewidths=.5, cbar_kws={"shrink": .5}
    )
    plt.show()
    return average_corrs
def multi_table(table_list):
    """
    Accepts a list of IpyTable objects and returns a single HTML table which
    contains each IpyTable in its own cell
    """
    cells = ['<td>' + table._repr_html_() + '</td>' for table in table_list]
    return HTML(
        '<table><tr style="background-color:white;">'
        + ''.join(cells)
        + '</tr></table>'
    )
def plot_2_instructions_multiple_kernels(info, dfs, figsize, INSTR_LEFT, INSTR_RIGHT):
    """Plot two instruction counts against gsize, one row of subplots per kernel.

    info: list of (benchmark, file, kernel) triples, parallel to dfs
    dfs: list of per-kernel DataFrames with 'gsize', INSTR_LEFT and
         INSTR_RIGHT columns
    figsize: matplotlib figure size for the whole grid
    INSTR_LEFT / INSTR_RIGHT: column names plotted in the left/right
         subplot of each kernel's row
    """
    n_kernels = len(dfs)
    # one "big" axis per kernel, used only to carry the row title
    fig, big_axes = plt.subplots(n_kernels, 1, figsize=figsize, sharey=True)
    for i, (kernel_info, big_ax) in enumerate(zip(info, big_axes)):
        big_ax.set_title(
            f'Benchmark: {kernel_info[0]}, File: {kernel_info[1]}, Kernel: {kernel_info[2]}',
            fontsize=24
        )
        # hide the big axis so only its title remains visible
        # NOTE(review): top='off' etc. is the pre-2.x string form of
        # top=False — confirm against the installed matplotlib version
        big_ax.tick_params(labelcolor=(1.,1.,1., 0.0), top='off', bottom='off', left='off', right='off')
        # _frameon is a private matplotlib attribute; removes the axis frame
        big_ax._frameon = False
    # the actual data subplots: two per kernel (even index = left column)
    for i in range(2 * n_kernels):
        ax = fig.add_subplot(n_kernels, 2, i + 1)
        ax.set_xlabel('gsize', fontsize=16)
        curr_df = dfs[i//2]
        if i % 2 == 0:
            ax.plot(curr_df['gsize'], curr_df[INSTR_LEFT])
            ax.set_ylabel(INSTR_LEFT, fontsize=16)
        else:
            ax.plot(curr_df['gsize'], curr_df[INSTR_RIGHT])
            ax.set_ylabel(INSTR_RIGHT, fontsize=16)
    fig.set_facecolor('w')
    plt.tight_layout()
    plt.show()
Each kernel from the rodinia benchmark suite was treated separately, even those that were in the same file.
From these, some could not be profiled with oclude due to frequent segmentation faults, which most probably were caused by the numerical values of the input buffers. At this point, it should be reminded that oclude initializes the kernel arguments in a random way, taking into consideration the argument type only (e.g. it will never initialize an unsigned type with a value below zero). An OpenCL kernel, though, may rely on the input data meeting certain requirements, either regarding the numerical values of the input or even the buffer sizes. For example, a kernel may take for granted that its first argument has twice the size of the second. It is obvious that, in such a case, segmentation faults are to be expected and oclude, in its current development status, can not do anything about them.
From the kernels that could be profiled (i.e. their execution did not lead to -frequent- segfaults), a number of profiling samples was taken. More precisely:
- The number of samples per profiling (-s/--samples) was kept constant and equal to 100 for all experiments described in this section.
- Each profiling was performed for a different global NDRange size (-g/--gsize). The local NDRange was not defined, which means that it was automatically and (nearly) optimally selected by the OpenCL runtime.
- The gsize values were increased until either a profiling timed out (-x/--timeout) or the size reached a value of 100000. Only the former happened.
- Each profiling was performed with the profile_opencl_kernel function of oclude. Every such profiling was stored as a JSON file for subsequent processing.

A pie chart showing the behavior of the rodinia OpenCL kernels during our experiments follows. You can hover your mouse over the pie chart to learn the absolute values.
# Tally the kernels: total, unprofilable ones, and profiled ones split by
# how small a gsize their first (smallest) profiling used.
all_kernels = 58 # I just counted them :)
# either kernels that constantly end up segfaulting
# or kernels that can not be randomly profiled
segfault_kernels = 12
unprofilable_kernels = segfault_kernels + 7
# regarding profiled kernels
profiling_filenames = get_grouped_all_profilings_filenames()
relatively_slow_kernels = 0
relatively_fast_kernels = 0
for _, kernel_profiling_files in profiling_filenames:
    # each group is sorted by gsize, so the first file holds the smallest one
    smallest_gsize = get_gsize_from_profiling_filename(kernel_profiling_files[0])
    if smallest_gsize <= 10:
        relatively_slow_kernels += 1
    else:
        relatively_fast_kernels += 1
# plot above info
labels = 'relatively fast', 'relatively slow', '"unprofilable"'
values = relatively_fast_kernels, relatively_slow_kernels, unprofilable_kernels
colors = 'green', 'orange', 'red'
fig = go.Figure(data=[go.Pie(
    labels=labels,
    values=values,
    marker=dict(colors=colors)
)])
fig.update_layout(
    title=dict(
        text='"Profilability" of rodinia OpenCL kernels',
        font=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        ),
        x=.5
    ),
    font=dict(
        family='Courier New, monospace',
        size=12,
        color='#7f7f7f'
    )
)
fig.show()
We can see that the largest share (43.1%) of the kernels are relatively fast, meaning that we managed to perform tens or even hundreds of 100-samples profiling experiments on them. They represent the majority of the rodinia benchmark suite and, as we will see later, the execution time and instructions tend to have a relatively linear relation to the global NDRange size in these kernels.
More specifically, the following table shows how many 100-sample profilings were performed on each kernel. Note that the data below refer to the "profilable" (i.e. "relatively fast" and "relatively slow") kernels, not the "unprofilable" ones.
# Build a summary table: one row per profiled kernel with its number of
# profilings and the smallest/largest gsize reached.
all_profiling_filenames = get_grouped_all_profilings_filenames()
columns = [
    'rodinia benchmark', 'benchmark file', 'kernel name',
    '# profilings', 'smallest gsize', 'largest gsize'
]
data = [
    # k is the [benchmark, file, kernel] triple; v is sorted by gsize,
    # so v[0]/v[-1] hold the smallest/largest gsize respectively
    k + [len(v), get_gsize_from_profiling_filename(v[0]), get_gsize_from_profiling_filename(v[-1])]
    for k, v in all_profiling_filenames
]
profiling_general_info_df = pd.DataFrame(
    data=data,
    columns=columns
)
profiling_general_info_df.index += 1  # 1-based row numbers for display
profiling_general_info_df
We can observe that another way of differentiating between "relatively fast" and "relatively slow" kernels is that the former are the ones the largest global NDRange size of which is greater than 1000, while the latter are the ones the smallest global NDRange size of which is smaller than 100:
# count the two groups using the gsize thresholds described above
print('"Relatively fast" kernels:',
      len(profiling_general_info_df[profiling_general_info_df['largest gsize'] > 1000]))
print('"Relatively slow" kernels:',
      len(profiling_general_info_df[profiling_general_info_df['smallest gsize'] < 100]))
We are now ready to investigate the relationship between the global NDRange size (henceforth gsize) and the instructions executed from an OpenCL kernel. To achieve this, we are going to use the data we acquired with the help of oclude.
It is reminded that each of the profilings of the kernels refers to a single gsize value and consists of 100 samples, taken with different random initialization of the kernel input data (i.e. the value(s) of its arguments). In that way, we are able to:
We will focus on the second approach.
In this section, we will work on all kernels and start gaining some insight on what is going on. For this section, we will work with the average executed instructions for each profiling and not for each one of the 100 measurements separately.
First of all, we will take the average instruction counts from all of the profilings we performed. We will create a list of DataFrames. Each one will correspond to a kernel and will contain the averaged executed instructions per gsize for this kernel.
# get the profiling files per kernel
grouped_profilings_filenames = get_grouped_all_profilings_filenames()
# load the dicts for all profilings
prof_dicts = [
    (k, load_profiling_dicts(v))
    for k, v in grouped_profilings_filenames
]
# calculate the average instcounts for all profilings
# (one averaged dict per profiling, i.e. per gsize value)
averaged_prof_dicts = [
    (k, get_average_instcounts_dicts(v))
    for k, v in prof_dicts
]
Now, we will create a DataFrame for each kernel, which will contain all the averaged instruction counts for every gsize value:
# Column names come from the instcounts keys of the first profiling of the
# first kernel.
llvm_instructions = list(averaged_prof_dicts[0][1][0]['results']['instcounts'].keys())
averaged_prof_dfs = []
for kernel_info, prof_info in averaged_prof_dicts:
    # one row per profiling (i.e. per gsize value) of this kernel
    df = pd.DataFrame(columns=['gsize'] + llvm_instructions, dtype=int)
    for i, sample in enumerate(prof_info):
        # NOTE(review): relies on every profiling's instcounts dict having
        # the same key order as llvm_instructions — TODO confirm
        df.loc[i] = [sample['gsize']] + list(sample['results']['instcounts'].values())
    averaged_prof_dfs.append(df)
Here is an example DataFrame for a random kernel:
# display the averaged-instcounts DataFrame of one (arbitrary) kernel
averaged_prof_dfs[24]
We are now ready to measure the correlation between the gsize parameter and the executed instructions. This is a very valuable insight, because it will let us know:
It is easy to do so for a single kernel (by calculating the correlation matrix of the the respective DataFrame), but it is not obvious how to gain an insight regarding the whole benchmark suite. To be more specific, it is easy to answer the following question:
How is instruction X correlated with instruction Y in kernel K?
But it is not obvious how to answer the (much more interesting) question:
How is instruction X correlated with instruction Y across all kernels?
To answer that, we will plot the average correlation matrix across all kernels. More specifically:
In this way, we will learn how each instruction is correlated with every other instruction across all kernels in average.
# average correlation matrix across all kernels (also plotted as a heatmap)
average_corrs = average_correlation_heatmap(averaged_prof_dfs)
From the above heatmap, it is clear that:
These are great news indeed, because we are led to the following conclusions:
Combining the above, we can draw even more interesting conclusions regarding the groups of "similarly escalating" instruction counts. For example, if such a group correlates strongly with gsize, we can safely assume that the instructions of this group scale linearly, given that gsize scales linearly.
For now, we will study the correlation between the executed instructions and gsize only, because we want to focus on our main goal which is to predict instruction counts given a gsize.
A list of the correlation of the instruction counts and gsize follows, in descending strength, along with the number of kernels in which they have appeared:
# Per-instruction summary: the number of kernels it appears in, and its
# average correlation with gsize (taken from the averaged correlation
# matrix computed above).
corrs_w_gsize = average_corrs['gsize'].dropna()
existent_instructions = [x for x in llvm_instructions if x in corrs_w_gsize]
indexes = ['# kernels', 'correlation with gsize']
instr_freq_df = pd.DataFrame(columns=existent_instructions, index=indexes)
for instr in existent_instructions:
    # average correlation of instr with gsize — loop-invariant per
    # instruction, so hoisted out of the per-kernel loop (the original
    # recomputed it on every kernel iteration)
    corr_w_gsize = corrs_w_gsize[instr]
    # number of kernels in which instr appears at least once
    n_kernels = sum(
        int((kernel_df[instr] > 0).any()) for kernel_df in averaged_prof_dfs
    )
    instr_freq_df[instr] = [n_kernels, corr_w_gsize]
instr_freq_df = instr_freq_df.transpose()
print('Left: sorted by number of kernels first')
print('Right: sorted by correlation with gsize first')
multi_table([
    instr_freq_df.sort_values(indexes, ascending=False),
    instr_freq_df.sort_values(indexes[::-1], ascending=False)
])
We observe that all the instructions that appear in multiple kernels (i.e. they are of greater significance to us) are strongly correlated with gsize. We obviously do not need to worry about instructions like fptrunc, fpext and fneg which have quite weaker correlations with gsize (especially the latter), given that they appear in 4, 4 and 1 kernel respectively, and thus are insignificant for our analysis.
We will continue with the "relatively fast" kernels. We believe that their fastness will be seen in the EDA that will follow in terms of a linear relationship between the gsize (independent variable) and the executed instructions (dependent variables).
First of all, we are going to split our DataFrames into 2 sets, consisting of the ones that correspond to the "relatively fast" kernels and of the ones that correspond to the "relatively slow" kernels.
# Split the per-kernel DataFrames (and their identifying triples) into the
# "relatively fast" and "relatively slow" groups, using the gsize
# thresholds established earlier.
relatively_fast_info, relatively_slow_info = [], []
relatively_fast_dfs, relatively_slow_dfs = [], []
for i, df in enumerate(averaged_prof_dfs):
    # averaged_prof_dfs and profiling_general_info_df come from the same
    # grouped/sorted filenames, so positional indexing lines them up
    prof_info = profiling_general_info_df.iloc[i]
    if prof_info['largest gsize'] > 1000:
        relatively_fast_info.append(
            (prof_info['rodinia benchmark'], prof_info['benchmark file'], prof_info['kernel name'])
        )
        relatively_fast_dfs.append(df)
    elif prof_info['smallest gsize'] < 100:
        relatively_slow_info.append(
            (prof_info['rodinia benchmark'], prof_info['benchmark file'], prof_info['kernel name'])
        )
        relatively_slow_dfs.append(df)
# NOTE(review): kernels matching neither condition are silently dropped —
# the printed totals below confirm whether the two groups cover everything
print('"relatively fast" dfs:', len(relatively_fast_dfs))
print('"relatively slow" dfs:', len(relatively_slow_dfs))
Let's see the average correlation again, but regarding the "relatively fast" kernels only:
# average correlation heatmap, "relatively fast" kernels only
_ = average_correlation_heatmap(relatively_fast_dfs)
Obviously excluding the fneg instruction, as well as the fptrunc and fpext instructions (which tend not to scale at all and rather stay constant in the kernels in which they exist), every other correlation between gsize and the executed instructions and between the executed instructions themselves is generally strong.
It is time to plot some of the instructions against gsize. For the sake of brevity, we will plot only the following instructions:
store global: It correlates extremely strongly with every other instruction and is very important in our analysis, given that accesses to the global OpenCL memory address space are one of the most expensive operations in terms of execution time.br: It is one of the possible instructions we can use -along call and ret- to study branching, another expensive operation. Obviously, call and ret correlate extremely strongly, given that whenever the former happens, the latter must follow. We can see from the heatmap above that call, ret and br form one of the many strongly correlated groups of instruction counts. Furthermore, it can be shown that call and ret are absolutely linear functions of gsize (the corresponding diagrams are omitted for brevity). Therefore, br is "more interesting" than call and ret at this point in our analysis. We will plot store global and br for each "relatively fast" kernel.
# store global vs br, one row of subplots per "relatively fast" kernel
plot_2_instructions_multiple_kernels(
    relatively_fast_info, relatively_fast_dfs, (20, 200),
    'store global', 'br'
)
The first observation we can make is that store global and br are near-linear or linear functions of gsize. This is an extremely important observation regarding the construction of regression models later; it may mean that we need to focus on linear models only.
Taking a more careful look, we have to record the following observations:
In some kernels, store global and/or br have zero counts.
Regarding store global, this may seem odd; what is the utility of an OpenCL kernel if it does not write data somewhere where the hostcode can retrieve them? It turns out, however, that some of them, e.g. the dwt2d kernels, are actually helper functions for other kernels, while others like the myocyte kernel use other helper functions to write to their global buffers. The store operations associated with the latter behavior are categorized as store callee, because oclude can not infer the OpenCL address space of a memory operation when it occurs outside of a kernel function. Every kernel that exhibits this behavior belongs to one of the two categories mentioned above.
Regarding br, only the streamcluster kernel does not record any, and this is because it actually does not include any branching. This is easily deduced by looking at its source code:
__kernel void memset_kernel(__global char * mem_d, short val, int number_bytes){
const int thread_id = get_global_id(0);
mem_d[thread_id] = val;
}
There are relations that are absolutely linear. In fact, all relations seem to be linear, with more or less noise.
The srad benchmark exhibits some worrying behavior regarding the relation between the executed instructions we plotted and gsize. More specifically, it also follows a linear-like trend but tends to spike a lot. Could this behavior be more generalized than it seems here? How can such relations be regressed and predicted?
We will keep in mind all of the above observations when we move to the design and selection of our regression models, and we will see if similar conclusions are drawn from the "relatively slow" kernels.
We will continue with the "relatively slow" kernels, and we will hope to be able to interpret their relative slowness, which led to less profilings and/or of smaller gsizes compared to the "relatively fast" ones. We believe that their slowness will be seen in the EDA that will follow in terms of a superlinear relationship between the gsize (independent variable) and the executed instructions (dependent variables).
We will start by following the same process as with the "relatively fast" kernels. Firstly, let's see the average correlation again, but regarding the "relatively slow" kernels only:
# average correlation heatmap, "relatively slow" kernels only
_ = average_correlation_heatmap(relatively_slow_dfs)
We see once again that the correlation between gsize and the executed instructions, as well as between the executed instructions themselves are really strong. Therefore, the answer regarding the slowness of these kernels will probably not be found here; we expect linear or near-linear relations again.
Let's take a closer look, the way we did before:
# store global vs br, one row of subplots per "relatively slow" kernel
plot_2_instructions_multiple_kernels(
    relatively_slow_info, relatively_slow_dfs, (20, 200),
    'store global', 'br'
)
Things are way more interesting than we expected. First of all, it should be noted that we now have way fewer profilings compared to what we had from the "relatively fast" kernels, due to their slowness. This is the main reason for the graphs not being so smooth now. But, even with these few samples, we can make some really important observations:
- Most of these kernels exhibit a superlinear relation between gsize and the executed instructions; see bfs, hotspot3D and kmeans amongst others.
- An exception is the lud benchmark, that exhibits linear behavior. Observe, though, that we get a really big count for the br instruction - around 100000 - with gsize being as small as 130. This explains why lud, despite exhibiting a linear behavior, belongs to the "relatively slow" group.

Most probably, the reason this superlinear behavior did not appear in the correlation heatmap above is the very small number of profilings.
Let's draw one last pie chart:
# Final pie chart: the two "profilable" groups, relabeled by the
# (super)linearity observed above.
labels = 'relatively fast (or approximately linear)', 'relatively slow (or approximately superlinear)'
values = relatively_fast_kernels, relatively_slow_kernels
colors = 'green', 'orange'
fig = go.Figure(data=[go.Pie(
    labels=labels,
    values=values,
    marker=dict(colors=colors)
)])
fig.update_layout(
    title=dict(
        text='Grouping of "profilable" rodinia OpenCL kernels',
        font=dict(
            family='Courier New, monospace',
            size=18,
            color='#7f7f7f'
        ),
        x=.5
    ),
    font=dict(
        family='Courier New, monospace',
        size=12,
        color='#7f7f7f'
    )
)
fig.show()
We concluded that:
Last but (certainly) not least, a secondary but extremely important conclusion is the following:
It may be the case that a single regression model won't be accurate enough for the whole benchmark suite. There will most probably be a single linear regression model that will be very accurate for the "relatively fast" kernels alone, but it is quite uncertain whether:
will (or can) be found.
As a goodbye and a thank you for staying with me until now, we will wrap up the investigative analysis with a reference to the -s/--samples flag of oclude and the importance it had in the analysis so far.
First of all, let's profile a kernel for 100 different gsizes, and for different sample sizes:
# profile the same kernel with the same parameters but for increasing number of samples and see what happens
from oclude import profile_opencl_kernel
gsizes = np.arange(50, 5001, 50, dtype=int).tolist()  # 100 gsizes: 50, 100, ..., 5000
samples = np.geomspace(1, 100, 3, dtype=int).tolist() + [500]  # 1, 10, 100, 500
args = dict(
    file = os.path.join(os.pardir, 'tests', 'rodinia_kernels', 'particlefilter', 'particle_single.cl'),
    kernel = 'normalize_weights_kernel',
    instcounts = True,
    timeout = 0 # do not timeout
)
profs = [] # will be list of lists
for s in samples:
    args['samples'] = s
    profs_for_s = []
    for g in gsizes:
        args['gsize'] = g
        profs_for_s.append(profile_opencl_kernel(**args))
    profs.append(profs_for_s)
Now, let's plot the above profiling data in an interesting manner:
# One subplot per sample size: averaged INST counts against gsize, with
# shared axes so the smoothing effect of more samples is visible.
averaged_profs = [get_average_instcounts_dicts(prof) for prof in profs]
INST = 'load global'
fig, ax = plt.subplots(1, 4, sharex=True, sharey=True, figsize=(20, 5))
fig.text(.5, 0, 'gsize', ha='center', fontsize=16)
fig.text(0, .5, INST, va='center', rotation='vertical', fontsize=16)
for i, s in enumerate(samples):
    instcounts = [x['results']['instcounts'][INST] for x in averaged_profs[i]]
    ax[i].plot(gsizes, instcounts)
    ax[i].set_title(str(s) + ' sample' + ('s' if s > 1 else ''))
plt.show()
We can clearly see that the more samples there are, the less influential the randomness that comes from the initialization of the kernel arguments becomes.
In the example above, it is obvious that the linear nature of the kernel is not visible at all when we get a single sample. What's more, it seems like we have to deal with greater instruction counts than in reality (i.e. there are a lot of spikes that reach values greater than 2000). But when we start to increase the number of samples (and calculate the average instruction counts), the more visible and obvious it becomes that: